#!/usr/bin/env python

from Bio import SeqIO
import sys
import argparse
import os
import gzip

parser = argparse.ArgumentParser(description='This script is for parsing a fasta file by pulling out sequences with the desired headers. If you want all sequences EXCEPT the ones with the headers you are providing, add the flag "--inverse". For version info, run `bit-version`.')

required = parser.add_argument_group('required arguments')

required.add_argument("-i", "--input_fasta", help="Original fasta file", action="store", dest="input_fasta", required=True)
required.add_argument("-w", "--sequence_headers", help="Single-column file with sequence headers", action="store", dest="headers", required=True)
parser.add_argument("-o", "--output_fasta", help='Output fasta file default: "Wanted.fa"', action="store", dest="output_fasta", default="Wanted.fa")
parser.add_argument("--inverse", help="Add this flag to pull out all sequences with headers NOT in the provided header file.", action="store_true")
parser.add_argument("--gz", help="Add this flag if the input is gzipped (does not gzip output)", action="store_true")

if len(sys.argv)==1:
    parser.print_help(sys.stderr)
    sys.exit(0)
    
args = parser.parse_args()

if not args.gz:

    fasta_in = open(args.input_fasta, "r")

else:

    fasta_in = gzip.open(args.input_fasta, "rt")


headers_of_int = open(args.headers, "r")

headers_of_int_set = set(line.strip() for line in headers_of_int)
headers_of_int.close()

fasta_out = open(args.output_fasta, "w")

if not args.inverse:

    for seq_record in SeqIO.parse(fasta_in, "fasta"):
        if seq_record.id in headers_of_int_set:
            fasta_out.write(">" + seq_record.id + "\n")
            fasta_out.write(str(seq_record.seq) + "\n")

else:

    for seq_record in SeqIO.parse(fasta_in, "fasta"):
        if seq_record.id not in headers_of_int_set:
            fasta_out.write(">" + seq_record.id + "\n")
            fasta_out.write(str(seq_record.seq) + "\n")

fasta_in.close()
fasta_out.close()
